import os
import re
import tiktoken
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import numpy as np
from scipy import stats
from scipy.stats import kruskal, mannwhitneyu
import warnings
from matplotlib.patches import Rectangle
import matplotlib.patches as mpatches
from matplotlib.gridspec import GridSpec

# Suppress warnings for cleaner output.
# NOTE: this installs a process-wide "ignore" filter for every warning category,
# so warnings emitted by any library call made after this point are hidden too.
warnings.filterwarnings('ignore')

def extract_analysis_section(content):
    """Return the text of the ANALYSIS section of an agent response.

    The section begins after the first ``ANALYSIS:`` marker (matched
    case-insensitively) and ends at the next ``REFINED_EDGES:`` marker, or at
    the end of ``content`` when that marker is missing. Leading and trailing
    whitespace is stripped from the result.

    Args:
        content: Full text of one agent response.

    Returns:
        The section text, or an empty string when nothing could be extracted.
    """
    found = re.search(
        r'ANALYSIS:\s*(.*?)\s*(?:REFINED_EDGES:|$)',
        content,
        re.DOTALL | re.IGNORECASE,
    )
    if found is not None:
        return found.group(1).strip()

    # Fallback: walk the lines, collecting everything between the ANALYSIS:
    # line and the REFINED_EDGES: line, then join the pieces with spaces.
    collected = []
    started = False
    for raw_line in content.split('\n'):
        upper_line = raw_line.upper()
        if 'ANALYSIS:' in upper_line:
            started = True
            # Keep whatever follows the first colon on the marker line.
            remainder = raw_line.split(':', 1)[1].strip() if ':' in raw_line else ''
            if remainder:
                collected.append(remainder)
        elif 'REFINED_EDGES:' in upper_line:
            break
        elif started:
            collected.append(raw_line.strip())

    return ' '.join(collected).strip()

def parse_filename(filename):
    """Split a response filename into ``(iteration, model, timestamp)``.

    Expected layout: ``response_iter<N>_<model name>_<digits>.txt``. The model
    part has ``_`` and ``-`` replaced by spaces and is title-cased for display.

    Args:
        filename: Bare file name (no directory part).

    Returns:
        ``(int iteration, str model, str timestamp)`` on a match, otherwise
        ``(None, None, None)``.
    """
    parsed = re.match(r'response_iter(\d+)_(.+?)_(\d+)\.txt', filename)
    if parsed is None:
        return None, None, None

    iteration_text, raw_model, timestamp = parsed.groups()

    # Turn separators into spaces, then title-case for nicer plot labels.
    display_model = raw_model.translate(str.maketrans('_-', '  ')).title()

    return int(iteration_text), display_model, timestamp

def calculate_effect_sizes(df):
    """Compute the absolute Cohen's d between every unordered pair of models.

    The pooled standard deviation is built from each group's sum of squared
    deviations, so a group with a single observation contributes zero spread
    instead of turning the whole result into NaN.

    Args:
        df: DataFrame with a ``model`` column and a numeric
            ``reasoning_tokens`` column.

    Returns:
        dict mapping ``"<model1> vs <model2>"`` (models in sorted order, each
        pair once) to ``|d|`` as a float. Degenerate cases:
          * fewer than three observations in total for the pair -> ``nan``
            (the pooled variance has no degrees of freedom);
          * zero pooled spread and equal means -> ``0.0``;
          * zero pooled spread and different means -> ``inf``.
    """
    models = sorted(df['model'].unique())
    # Slice each model's observations once instead of once per pair.
    samples = {
        model: df.loc[df['model'] == model, 'reasoning_tokens']
        for model in models
    }

    effect_sizes = {}
    for idx, model1 in enumerate(models):
        for model2 in models[idx + 1:]:
            data1, data2 = samples[model1], samples[model2]
            mean1, mean2 = data1.mean(), data2.mean()
            mean_gap = abs(mean1 - mean2)
            dof = len(data1) + len(data2) - 2

            if dof <= 0:
                cohens_d = float('nan')
            else:
                # (n - 1) * var written as a sum of squares: well defined for n == 1.
                ss1 = ((data1 - mean1) ** 2).sum()
                ss2 = ((data2 - mean2) ** 2).sum()
                pooled_std = np.sqrt((ss1 + ss2) / dof)
                if pooled_std > 0:
                    cohens_d = mean_gap / pooled_std
                elif mean_gap == 0:
                    cohens_d = 0.0
                else:
                    cohens_d = float('inf')

            effect_sizes[f"{model1} vs {model2}"] = float(cohens_d)

    return effect_sizes

def perform_comprehensive_statistics(df):
    """Run descriptive, normality, omnibus and pairwise statistics per model.

    Args:
        df: DataFrame with a ``model`` column and a numeric
            ``reasoning_tokens`` column.

    Returns:
        dict with the keys
          * ``'descriptive'``: per-model dict (n, mean, std, median, q1, q3,
            min, max, skewness, kurtosis, cv);
          * ``'normality'``: per-model Shapiro-Wilk ``statistic``/``p_value``,
            only for models with at least 3 observations;
          * ``'omnibus'``: ``{'test', 'statistic', 'p_value'}`` for ANOVA or
            Kruskal-Wallis, or ``None`` when there is a single model;
          * ``'pairwise'``: list of Mann-Whitney U results with Bonferroni
            corrected p-values and absolute rank-biserial effect sizes;
          * ``'effect_sizes'``: result of ``calculate_effect_sizes(df)``.
    """
    models = sorted(df['model'].unique())
    # Slice each model's observations once and reuse them in every section.
    samples = {
        model: df.loc[df['model'] == model, 'reasoning_tokens']
        for model in models
    }

    # Basic descriptive statistics
    descriptive_stats = {}
    for model, data in samples.items():
        mean = data.mean()
        std = data.std()
        descriptive_stats[model] = {
            'n': len(data),
            'mean': mean,
            'std': std,
            'median': data.median(),
            'q1': data.quantile(0.25),
            'q3': data.quantile(0.75),
            'min': data.min(),
            'max': data.max(),
            'skewness': stats.skew(data),
            'kurtosis': stats.kurtosis(data),
            # Coefficient of variation; defined as 0 when the mean is 0.
            'cv': std / mean if mean != 0 else 0
        }

    # Test for normality (Shapiro-Wilk needs at least 3 observations)
    normality_tests = {}
    for model, data in samples.items():
        if len(data) >= 3:
            stat, p_val = stats.shapiro(data)
            normality_tests[model] = {'statistic': stat, 'p_value': p_val}

    # Omnibus test across all models
    omnibus_test = None
    if len(models) > 1:
        model_data = [samples[model].values for model in models]

        # Check for equal variances
        _, levene_p = stats.levene(*model_data)

        # ANOVA is only used when normality was actually tested and not
        # rejected for EVERY model. A model too small for Shapiro-Wilk counts
        # as "not verified", so the non-parametric test is chosen instead.
        all_normal = all(
            model in normality_tests and normality_tests[model]['p_value'] > 0.05
            for model in models
        )

        if all_normal and levene_p > 0.05:
            f_stat, anova_p = stats.f_oneway(*model_data)
            omnibus_test = {'test': 'ANOVA', 'statistic': f_stat, 'p_value': anova_p}
        else:
            h_stat, kw_p = kruskal(*model_data)
            omnibus_test = {'test': 'Kruskal-Wallis', 'statistic': h_stat, 'p_value': kw_p}

    # Pairwise comparisons with Bonferroni correction
    pairwise_results = []
    alpha = 0.05
    n_comparisons = len(models) * (len(models) - 1) // 2
    bonferroni_alpha = alpha / n_comparisons if n_comparisons > 0 else alpha

    for idx, model1 in enumerate(models):
        for model2 in models[idx + 1:]:
            data1, data2 = samples[model1], samples[model2]

            # Mann-Whitney U test (non-parametric)
            stat, p_val = mannwhitneyu(data1, data2, alternative='two-sided')

            # Rank-biserial correlation; only its magnitude is reported.
            r = 1 - (2 * stat) / (len(data1) * len(data2))

            pairwise_results.append({
                'model1': model1,
                'model2': model2,
                'statistic': stat,
                'p_value': p_val,
                'p_corrected': min(p_val * n_comparisons, 1.0),
                'significant': p_val < bonferroni_alpha,
                'effect_size': abs(r)
            })

    return {
        'descriptive': descriptive_stats,
        'normality': normality_tests,
        'omnibus': omnibus_test,
        'pairwise': pairwise_results,
        'effect_sizes': calculate_effect_sizes(df)
    }

def set_publication_style():
    """Apply the journal-style matplotlib settings used for all figures.

    Selects the seaborn "whitegrid" style sheet and then overrides fonts, line
    widths, grid and tick settings through ``plt.rcParams``. The changes are
    global to the matplotlib session.
    """
    # The bundled seaborn style sheets were renamed to "seaborn-v0_8-*" in
    # matplotlib 3.6; fall back to the legacy name on older installations.
    try:
        plt.style.use('seaborn-v0_8-whitegrid')
    except OSError:
        plt.style.use('seaborn-whitegrid')

    plt.rcParams.update({
        'font.family': 'serif',
        'font.serif': ['Times New Roman', 'Computer Modern Roman'],
        'font.size': 10,
        'axes.titlesize': 12,
        'axes.labelsize': 11,
        'xtick.labelsize': 9,
        'ytick.labelsize': 9,
        'legend.fontsize': 9,
        'figure.titlesize': 14,
        'axes.linewidth': 1.0,
        'axes.edgecolor': 'black',
        'axes.facecolor': 'white',
        'figure.facecolor': 'white',
        'grid.alpha': 0.3,
        'grid.linewidth': 0.5,
        'lines.linewidth': 1.5,
        'patch.linewidth': 1.0,
        'xtick.major.width': 1.0,
        'ytick.major.width': 1.0,
        'xtick.minor.width': 0.5,
        'ytick.minor.width': 0.5,
        'xtick.direction': 'in',
        'ytick.direction': 'in'
    })

def create_raincloud_plot(ax, data, positions, colors, labels):
    """Draw one "raincloud" per group: half violin, box plot and jittered points.

    Args:
        ax: Matplotlib axes to draw on.
        data: Indexable collection of samples; ``data[i]`` belongs to group i.
        positions: x position of each group.
        colors: Fill colour of each group.
        labels: One entry per group. The values are not rendered here; the
            sequence only takes part in the zip that bounds the loop.
    """
    for idx, (center, shade, _label) in enumerate(zip(positions, colors, labels)):
        sample = data[idx]

        # Density estimate, later clipped to its right half.
        violin = ax.violinplot(
            [sample],
            positions=[center],
            showmeans=False,
            showmedians=False,
            showextrema=False,
            widths=0.4,
        )
        for body in violin['bodies']:
            body.set_facecolor(shade)
            body.set_alpha(0.7)
            body.set_edgecolor('black')
            body.set_linewidth(0.8)

            # Push every vertex left of the outline's mean x onto that mean,
            # which leaves only the right-hand half of the violin visible.
            outline = body.get_paths()[0].vertices
            midline = np.mean(outline[:, 0])
            outline[:, 0] = np.clip(outline[:, 0], midline, np.inf)

        # Narrow box plot, shifted to the left of the group position.
        ax.boxplot(
            [sample],
            positions=[center - 0.2],
            widths=0.15,
            patch_artist=True,
            manage_ticks=False,
            boxprops=dict(facecolor=shade, alpha=0.8, edgecolor='black'),
            whiskerprops=dict(color='black', linewidth=1),
            capprops=dict(color='black', linewidth=1),
            medianprops=dict(color='darkred', linewidth=2),
            flierprops=dict(marker='o', markerfacecolor=shade, markersize=3, alpha=0.6),
        )

        # Raw observations with a little horizontal jitter, right of centre.
        x_jitter = np.random.normal(center + 0.1, 0.02, len(sample))
        ax.scatter(x_jitter, sample, alpha=0.6, s=8, color=shade,
                   edgecolor='black', linewidth=0.3)

def _short_label(name, limit):
    """Return ``name`` cut to ``limit`` characters plus '...' when it is longer."""
    return name[:limit] + '...' if len(name) > limit else name


def _significance_stars(p_val):
    """Map a p-value to its star annotation ('' when p >= 0.05)."""
    if p_val < 0.001:
        return '***'
    if p_val < 0.01:
        return '**'
    if p_val < 0.05:
        return '*'
    return ''


def create_comprehensive_analysis_figure(df, stats_results):
    """Build the six-panel summary figure.

    Panels: (A) raincloud plot of token counts per model, (B) descriptive
    statistics table, (C) heatmap of pairwise effect sizes with significance
    stars, (D) skewness vs. kurtosis scatter, (E) coefficient of variation
    bars, (F) polar bar chart of sample sizes.

    Args:
        df: DataFrame with ``model`` and ``reasoning_tokens`` columns.
        stats_results: dict providing the ``'descriptive'``, ``'omnibus'`` and
            ``'pairwise'`` entries read below.

    Returns:
        The populated matplotlib Figure.
    """
    # Set up the figure with custom grid
    fig = plt.figure(figsize=(16, 12))
    gs = GridSpec(3, 3, figure=fig, hspace=0.35, wspace=0.3)

    models = sorted(df['model'].unique())
    n_models = len(models)
    model_data = [df[df['model'] == model]['reasoning_tokens'].values for model in models]
    descriptive = stats_results['descriptive']

    # Colorblind-friendly palette, cycled so there is exactly one colour per
    # model even when there are more models than palette entries.
    palette = ['#E69F00', '#56B4E9', '#009E73', '#F0E442', '#0072B2', '#D55E00']
    colors = [palette[i % len(palette)] for i in range(n_models)]

    # Panel A: Raincloud plot
    ax1 = fig.add_subplot(gs[0, :2])
    create_raincloud_plot(ax1, model_data, range(n_models), colors, models)

    ax1.set_title('A. Distribution of Reasoning Tokens Across Models', fontweight='bold', pad=15)
    ax1.set_xlabel('Model')
    ax1.set_ylabel('Reasoning Tokens')
    ax1.set_xticks(range(n_models))
    ax1.set_xticklabels(models, rotation=45, ha='right')
    ax1.grid(True, alpha=0.3, axis='y')

    # Add statistical annotation (omnibus is None when only one model exists)
    omnibus = stats_results['omnibus']
    if omnibus:
        p_val = omnibus['p_value']
        p_text = "p < 0.001" if p_val < 0.001 else f"p = {p_val:.3f}"
        ax1.text(0.02, 0.98, f"{omnibus['test']}: {omnibus['statistic']:.3f}, {p_text}",
                 transform=ax1.transAxes, va='top', ha='left',
                 bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.8))

    # Panel B: Statistical summary table
    ax2 = fig.add_subplot(gs[0, 2])
    ax2.axis('off')

    headers = ['Model', 'n', 'Mean±SD', 'Median', 'IQR']
    table_data = []
    for model in models:
        desc = descriptive[model]
        table_data.append([
            _short_label(model, 10),
            f"{desc['n']}",
            f"{desc['mean']:.1f}±{desc['std']:.1f}",
            f"{desc['median']:.1f}",
            f"{desc['q1']:.1f}-{desc['q3']:.1f}"
        ])

    table = ax2.table(cellText=table_data, colLabels=headers,
                      cellLoc='center', loc='center',
                      colWidths=[0.25, 0.12, 0.22, 0.18, 0.23])
    table.auto_set_font_size(False)
    table.set_fontsize(8)
    table.scale(1, 1.5)

    # Style the header row
    for col in range(len(headers)):
        table[(0, col)].set_facecolor('#E6E6E6')
        table[(0, col)].set_text_props(weight='bold')

    ax2.set_title('B. Descriptive Statistics', fontweight='bold', pad=15)

    # Panel C: Effect sizes heatmap
    ax3 = fig.add_subplot(gs[1, :])

    # Symmetric matrices filled from the pairwise results
    comparison_matrix = np.zeros((n_models, n_models))
    p_value_matrix = np.zeros((n_models, n_models))

    for result in stats_results['pairwise']:
        i = models.index(result['model1'])
        j = models.index(result['model2'])
        comparison_matrix[i, j] = comparison_matrix[j, i] = result['effect_size']
        p_value_matrix[i, j] = p_value_matrix[j, i] = result['p_corrected']

    im = ax3.imshow(comparison_matrix, cmap='RdYlBu_r', aspect='equal', vmin=0, vmax=1)

    # Annotate each off-diagonal cell with its effect size and stars
    for i in range(n_models):
        for j in range(n_models):
            if i == j:
                ax3.text(j, i, '—', ha='center', va='center', fontsize=12)
                continue
            effect_size = comparison_matrix[i, j]
            stars = _significance_stars(p_value_matrix[i, j])
            text_color = 'white' if effect_size > 0.5 else 'black'
            ax3.text(j, i, f'{effect_size:.2f}\n{stars}',
                     ha='center', va='center', color=text_color, fontsize=8)

    heatmap_labels = [_short_label(m, 8) for m in models]
    ax3.set_xticks(range(n_models))
    ax3.set_yticks(range(n_models))
    ax3.set_xticklabels(heatmap_labels, rotation=45, ha='right')
    ax3.set_yticklabels(heatmap_labels)
    ax3.set_title('C. Pairwise Effect Sizes (Rank-Biserial Correlation)', fontweight='bold', pad=15)

    # Add colorbar
    cbar = fig.colorbar(im, ax=ax3, shrink=0.8)
    cbar.set_label('Effect Size', rotation=270, labelpad=15)

    # Panel D: Distribution shape analysis
    ax4 = fig.add_subplot(gs[2, 0])

    skewness_values = [descriptive[model]['skewness'] for model in models]
    kurtosis_values = [descriptive[model]['kurtosis'] for model in models]

    ax4.scatter(skewness_values, kurtosis_values,
                c=colors, s=100, alpha=0.7, edgecolor='black')

    for i, model in enumerate(models):
        ax4.annotate(_short_label(model, 6),
                     (skewness_values[i], kurtosis_values[i]),
                     xytext=(5, 5), textcoords='offset points', fontsize=8)

    ax4.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
    ax4.axvline(x=0, color='gray', linestyle='--', alpha=0.5)
    ax4.set_xlabel('Skewness')
    ax4.set_ylabel('Kurtosis')
    ax4.set_title('D. Distribution Shape', fontweight='bold')
    ax4.grid(True, alpha=0.3)

    # Panel E: Coefficient of variation
    ax5 = fig.add_subplot(gs[2, 1])

    cv_values = [descriptive[model]['cv'] for model in models]
    bars = ax5.bar(range(n_models), cv_values, color=colors,
                   alpha=0.7, edgecolor='black', linewidth=1)

    ax5.set_xlabel('Model')
    ax5.set_ylabel('Coefficient of Variation')
    ax5.set_title('E. Variability Comparison', fontweight='bold')
    ax5.set_xticks(range(n_models))
    ax5.set_xticklabels([_short_label(m, 6) for m in models],
                        rotation=45, ha='right')
    ax5.grid(True, alpha=0.3, axis='y')

    # Add value labels on bars
    for bar, cv in zip(bars, cv_values):
        height = bar.get_height()
        ax5.text(bar.get_x() + bar.get_width() / 2., height + 0.01,
                 f'{cv:.3f}', ha='center', va='bottom', fontsize=8)

    # Panel F: Sample sizes on a polar bar chart.
    # Only ONE axes is created for this grid cell; creating a Cartesian axes
    # first and a polar one afterwards would leave both in the figure.
    ax6 = fig.add_subplot(gs[2, 2], projection='polar')

    sample_sizes = [descriptive[model]['n'] for model in models]
    theta = np.linspace(0, 2 * np.pi, n_models + 1)[:-1]

    sizes = np.asarray(sample_sizes, dtype=float)
    size_span = sizes.max() - sizes.min()
    if size_span > 0:
        r_normalized = (sizes - sizes.min()) / size_span * 0.8 + 0.2  # Scale to 0.2-1.0
    else:
        # Every model has the same n: min-max scaling would divide by zero,
        # so draw all bars at full length instead.
        r_normalized = np.ones_like(sizes)

    ax6.bar(theta, r_normalized, width=2 * np.pi / n_models,
            color=colors, alpha=0.7, edgecolor='black')

    ax6.set_xticks(theta)
    ax6.set_xticklabels([_short_label(m, 4) for m in models])
    ax6.set_title('F. Sample Sizes', fontweight='bold', pad=20)
    ax6.set_ylim(0, 1)

    # Add sample size labels just beyond the end of each bar
    for angle, r_val, n in zip(theta, r_normalized, sample_sizes):
        ax6.text(angle, r_val + 0.1, str(n), ha='center', va='center', fontsize=8)

    # Add overall title
    fig.suptitle('Comprehensive Chain-of-Thought Reasoning Analysis',
                 fontsize=16, fontweight='bold', y=0.98)

    # Add methodology note
    fig.text(0.02, 0.02,
             'Statistical methods: Mann-Whitney U tests with Bonferroni correction. '
             'Effect sizes calculated as rank-biserial correlation. '
             '* p < 0.05, ** p < 0.01, *** p < 0.001',
             fontsize=8, style='italic', wrap=True)

    return fig

def load_data_and_create_publication_plots(folder_path="/Users/acasadei/Downloads/agent_responses 2"):
    """Load agent responses, run the statistics and write figure and report.

    Every ``.txt`` file in ``folder_path`` whose name matches the pattern
    understood by ``parse_filename`` is read, its ANALYSIS section is
    extracted and tokenised with the ``cl100k_base`` encoding. Files that do
    not match, have no ANALYSIS text, or fail to load are skipped.

    Args:
        folder_path: Directory containing the response files. Defaults to the
            location that was previously hard-coded.

    Returns:
        ``(df, stats_results)`` on success, or ``(None, None)`` when the folder
        is missing, contains no ``.txt`` files, or yields no usable responses.

    Side effects:
        Prints progress to stdout, saves ``comprehensive_cot_analysis`` as
        PNG/PDF/SVG in the working directory, shows the figure, and writes the
        statistical report.
    """
    print("🔍 Loading Chain of Thought Data...")

    # Initialize tokenizer
    enc = tiktoken.get_encoding("cl100k_base")

    # Collect files in sorted order so runs are reproducible across platforms.
    try:
        files = sorted(f for f in os.listdir(folder_path) if f.endswith('.txt'))
    except FileNotFoundError:
        print(f"❌ Error: Folder '{folder_path}' not found.")
        print("Please update the folder_path variable to point to your data directory.")
        return None, None

    if not files:
        print(f"❌ Error: No .txt files found in '{folder_path}'.")
        return None, None

    print(f"📂 Found {len(files)} files to process...")

    results = []
    for filename in files:
        iteration, model, timestamp = parse_filename(filename)

        # parse_filename returns all-None on a mismatch. Test for None
        # explicitly: a truthiness check would also discard iteration 0.
        if iteration is None:
            continue

        filepath = os.path.join(folder_path, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()

            analysis_text = extract_analysis_section(content)
            if not analysis_text:
                continue

            # disallowed_special=() makes tiktoken encode strings such as
            # "<|endoftext|>" as ordinary text instead of raising ValueError.
            reasoning_tokens = len(enc.encode(analysis_text, disallowed_special=()))

            results.append({
                'filename': filename,
                'iteration': iteration,
                'model': model,
                'timestamp': timestamp,
                'analysis_text': analysis_text,
                'reasoning_tokens': reasoning_tokens,
                'text_length': len(analysis_text),
                'word_count': len(analysis_text.split())
            })

        except Exception as e:
            # Best effort: report the file and carry on with the rest.
            print(f"⚠️  Warning: Could not process {filename}: {e}")
            continue

    if not results:
        print("❌ Error: No valid data found.")
        return None, None

    # Create DataFrame
    df = pd.DataFrame(results)

    print(f"✅ Loaded {len(df)} valid responses from {df['model'].nunique()} models")
    print(f"📊 Models: {', '.join(sorted(df['model'].unique()))}")

    # Perform comprehensive statistical analysis
    print("\n🔬 Performing comprehensive statistical analysis...")
    stats_results = perform_comprehensive_statistics(df)

    # Print statistical summary
    print("\n📈 Statistical Summary:")
    print("=" * 50)

    for model in sorted(stats_results['descriptive'].keys()):
        desc = stats_results['descriptive'][model]
        print(f"🤖 {model}:")
        print(f"   N: {desc['n']}, Mean: {desc['mean']:.1f}±{desc['std']:.1f}")
        print(f"   Median: {desc['median']:.1f}, IQR: {desc['q1']:.1f}-{desc['q3']:.1f}")
        print(f"   CV: {desc['cv']:.3f}, Skew: {desc['skewness']:.2f}")

    if stats_results['omnibus']:
        omnibus = stats_results['omnibus']
        print(f"\n🔍 {omnibus['test']}: {omnibus['statistic']:.3f}, p = {omnibus['p_value']:.4f}")

    # Set publication style
    set_publication_style()

    # Create comprehensive figure
    print("\n🎨 Creating comprehensive publication-quality figure...")
    fig = create_comprehensive_analysis_figure(df, stats_results)

    # Save figures
    output_files = [
        'comprehensive_cot_analysis.png',
        'comprehensive_cot_analysis.pdf',
        'comprehensive_cot_analysis.svg'
    ]

    for output_file in output_files:
        fig.savefig(output_file, dpi=600, bbox_inches='tight',
                    facecolor='white', edgecolor='none', format=output_file.split('.')[-1])

    plt.show()

    print("\n✅ Publication-quality figures saved:")
    for saved_file in output_files:
        print(f"   - {saved_file}")

    # Generate statistical report
    generate_statistical_report(df, stats_results)

    return df, stats_results

def generate_statistical_report(df, stats_results):
    """Write the plain-text statistical report to ``statistical_report.txt``.

    The file is written as UTF-8 in the current working directory, because the
    report contains non-ASCII symbols (±, ≤, ≥) that some platform default
    encodings cannot represent.

    Args:
        df: DataFrame with ``model`` and ``timestamp`` columns (used for the
            dataset overview).
        stats_results: dict with ``'descriptive'``, ``'normality'``,
            ``'omnibus'``, ``'pairwise'`` and ``'effect_sizes'`` entries.
    """
    report_path = 'statistical_report.txt'

    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("COMPREHENSIVE STATISTICAL ANALYSIS REPORT\n")
        f.write("=" * 50 + "\n\n")

        # Dataset overview
        f.write("DATASET OVERVIEW\n")
        f.write("-" * 20 + "\n")
        f.write(f"Total observations: {len(df)}\n")
        f.write(f"Number of models: {df['model'].nunique()}\n")
        f.write(f"Models analyzed: {', '.join(sorted(df['model'].unique()))}\n")
        f.write(f"Date range: {df['timestamp'].min()} - {df['timestamp'].max()}\n\n")

        # Descriptive statistics
        f.write("DESCRIPTIVE STATISTICS\n")
        f.write("-" * 25 + "\n")

        for model in sorted(stats_results['descriptive'].keys()):
            desc = stats_results['descriptive'][model]
            f.write(f"\n{model}:\n")
            f.write(f"  Sample size (n): {desc['n']}\n")
            f.write(f"  Mean ± SD: {desc['mean']:.2f} ± {desc['std']:.2f}\n")
            f.write(f"  Median (IQR): {desc['median']:.1f} ({desc['q1']:.1f}-{desc['q3']:.1f})\n")
            f.write(f"  Range: {desc['min']:.0f} - {desc['max']:.0f}\n")
            f.write(f"  Coefficient of variation: {desc['cv']:.3f}\n")
            f.write(f"  Skewness: {desc['skewness']:.3f}\n")
            f.write(f"  Kurtosis: {desc['kurtosis']:.3f}\n")

        # Normality tests
        f.write("\n\nNORMALITY TESTS (Shapiro-Wilk)\n")
        f.write("-" * 35 + "\n")

        for model in sorted(stats_results['normality'].keys()):
            norm_test = stats_results['normality'][model]
            f.write(f"{model}: W = {norm_test['statistic']:.4f}, p = {norm_test['p_value']:.4f}\n")

        # Overall test
        if stats_results['omnibus']:
            f.write("\n\nOVERALL COMPARISON\n")
            f.write("-" * 20 + "\n")
            omnibus = stats_results['omnibus']
            f.write(f"{omnibus['test']}: {omnibus['statistic']:.4f}, p = {omnibus['p_value']:.6f}\n")

        # Pairwise comparisons
        f.write("\n\nPAIRWISE COMPARISONS (Bonferroni corrected)\n")
        f.write("-" * 45 + "\n")

        for result in stats_results['pairwise']:
            p_corrected = result['p_corrected']
            if p_corrected < 0.001:
                sig_symbol = "***"
            elif p_corrected < 0.01:
                sig_symbol = "**"
            elif p_corrected < 0.05:
                sig_symbol = "*"
            else:
                sig_symbol = "ns"

            f.write(f"{result['model1']} vs {result['model2']}:\n")
            f.write(f"  Mann-Whitney U = {result['statistic']:.2f}\n")
            f.write(f"  p-value (corrected) = {p_corrected:.6f} {sig_symbol}\n")
            f.write(f"  Effect size (r) = {result['effect_size']:.3f}\n\n")

        # Effect sizes
        f.write("\n\nEFFECT SIZES (Cohen's d)\n")
        f.write("-" * 25 + "\n")

        for pair, effect_size in stats_results['effect_sizes'].items():
            if np.isnan(effect_size):
                # NaN fails every "<" comparison; without this branch an
                # undefined effect size would be reported as "large".
                magnitude = "undefined"
            elif effect_size < 0.2:
                magnitude = "negligible"
            elif effect_size < 0.5:
                magnitude = "small"
            elif effect_size < 0.8:
                magnitude = "medium"
            else:
                magnitude = "large"

            f.write(f"{pair}: d = {effect_size:.3f} ({magnitude})\n")

        f.write("\n\nINTERPRETATION GUIDELINES\n")
        f.write("-" * 25 + "\n")
        f.write("Effect size interpretation (Cohen's d):\n")
        f.write("  d < 0.2: negligible effect\n")
        f.write("  0.2 ≤ d < 0.5: small effect\n")
        f.write("  0.5 ≤ d < 0.8: medium effect\n")
        f.write("  d ≥ 0.8: large effect\n\n")
        f.write("Significance levels:\n")
        f.write("  * p < 0.05\n")
        f.write("  ** p < 0.01\n")
        f.write("  *** p < 0.001\n")
        f.write("  ns: not significant\n\n")
        f.write("Note: All p-values are Bonferroni corrected for multiple comparisons.\n")

    print(f"✅ Statistical report saved: {report_path}")

if __name__ == "__main__":
    # Run the whole pipeline. A None DataFrame means loading failed, in which
    # case the loader has already printed the reason.
    df, stats_results = load_data_and_create_publication_plots()
    if df is not None:
        print(
            "\n🎯 Comprehensive publication-quality analysis complete!\n"
            "📋 Files generated:\n"
            "   - comprehensive_cot_analysis.png/.pdf/.svg\n"
            "   - statistical_report.txt"
        )
